In [3]:
# standard
from IPython import embed
import pandas as pd
import numpy as np

# frameworks
from frameworks.seq2seq_keras.models import AttentionSeq2Seq
from gensim.models import Word2Vec

# custom
from data_utils import get_train_data
from word2vec import get_word_embedding
from vocab import get_vocab

Global Variables


In [7]:
_BATCH_SIZE = 64
_VOCAB_SIZE = 6000
_WORD_DIM = 128
_MODEL_DEPTH = 4

_INPUT_LENGTH = 25
_OUTPUT_LENGTH = 10

Model


In [3]:
model = AttentionSeq2Seq(input_length=_INPUT_LENGTH, 
                         input_dim=_WORD_DIM, 
                         hidden_dim=_WORD_DIM, 
                         output_length=_OUTPUT_LENGTH, 
                         output_dim=_WORD_DIM, 
                         depth=_MODEL_DEPTH)
model.compile(loss='mse', optimizer='rmsprop')
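
The compiled model can be inspected before training; a quick look at the layer stack, assuming AttentionSeq2Seq returns a standard Keras model (it is compiled with compile() above, so summary() should be available):

In [ ]:
model.summary()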

Data


In [8]:
embedding = get_word_embedding(_WORD_DIM)

In [7]:
train_data = get_train_data()
_, ch2int = get_vocab()

In [8]:
len(train_data)


Out[8]:
39956

In [9]:
def pad_to(lst, length, value):
    # Append `value` until lst reaches `length`; does not truncate longer lists.
    for i in range(len(lst), length):
        lst.append(value)
    
    return lst

def clean_train_data(train_data):
    X_train = []
    Y_train = []
    for idx in xrange(len(train_data)):
        # Position of this sentence within its poem (4 sentences per poem).
        line_number = idx % 4
        
        keyword = train_data[idx]['keyword']
        current_sentence = train_data[idx]['sentence']
        # All earlier sentences of the same poem, in order.
        previous_sentences = ''.join([train_data[idx - i]['sentence'] for i in range(line_number, 0, -1)])
        
        # Encode characters as ids and pad with the last vocab id.
        X_entry = pad_to([[ch2int[ch]] for ch in (keyword + previous_sentences)], _INPUT_LENGTH, [_VOCAB_SIZE - 1])
        Y_entry = pad_to([[ch2int[ch]] for ch in current_sentence], _OUTPUT_LENGTH, [_VOCAB_SIZE - 1])
        
        X_train.append(X_entry)
        Y_train.append(Y_entry)
        
    return X_train, Y_train
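
Note that pad_to only appends padding; it does not truncate, so an entry whose keyword plus history exceeds _INPUT_LENGTH characters would come out longer than 25. A toy check of the padding behaviour (hypothetical cell, not part of the original run):

In [ ]:
pad_to([[3], [17]], 5, [_VOCAB_SIZE - 1])
# -> [[3], [17], [5999], [5999], [5999]]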

In [10]:
X_train, Y_train = clean_train_data(train_data)

In [13]:
# Replace each character id with its word2vec embedding vector.
X_train_embedded = [map(lambda x: embedding[x[0]], sample) for sample in X_train]

In [14]:
Y_train_embedded = [map(lambda x: embedding[x[0]], sample) for sample in Y_train]

Training


In [15]:
model.fit(X_train_embedded, Y_train_embedded, epochs=1, verbose=1)


Epoch 1/1
39956/39956 [==============================] - 421s - loss: 0.4278   
Out[15]:
<keras.callbacks.History at 0x7f834e02d4d0>
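
Since the single epoch above takes roughly seven minutes, it may be worth persisting the weights before moving on; a sketch using the standard Keras call (the file path is hypothetical):

In [ ]:
model.save_weights('data/attention_seq2seq.h5')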

Generation


In [16]:
kw = u'山水'  # keyword: "mountains and water", i.e. landscape

In [17]:
kw_pad = [pad_to([[ch2int[ch]] for ch in kw], _INPUT_LENGTH, [_VOCAB_SIZE - 1])]

In [18]:
kw_embed = [map(lambda x: embedding[x[0]], sample) for sample in kw_pad]

In [19]:
kw_embed_array = np.array(kw_embed)

In [20]:
pred = model.predict(kw_embed_array)
pred


Out[20]:
array([[[-0.08963315,  0.13422257, -0.24570994, ..., -0.14995289,
          0.17542832,  0.13692029],
        [-0.03718845,  0.30552149, -0.17106384, ..., -0.05145008,
          0.19576812,  0.19635646],
        [-0.08512392,  0.32059655, -0.13894799, ..., -0.04305276,
          0.11034762,  0.05403135],
        ..., 
        [ 0.82171226,  0.54636025,  0.8892892 , ..., -0.24688512,
         -0.63101834,  0.0330935 ],
        [ 0.84401846,  0.55392796,  0.92472196, ..., -0.2636658 ,
         -0.64143229,  0.04308706],
        [ 0.84558034,  0.54933226,  0.92686731, ..., -0.2670286 ,
         -0.64220965,  0.04647564]]], dtype=float32)

In [21]:
w2v_model = Word2Vec.load('data/word2vec.model')

In [22]:
result = []
for i in range(len(pred[0])):
    # Nearest vocabulary word in word2vec space to each predicted embedding.
    result.append(w2v_model.most_similar(positive=[pred[0][i]], topn=1))

In [4]:
for r in result:
    print r[0][0]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-f62339b98eeb> in <module>()
----> 1 for r in result:
      2     print r[0][0]

NameError: name 'result' is not defined
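
The NameError comes from a kernel restart between In [22] and In [4] (note the execution counter resetting), which drops `result` from the namespace. A sketch of the same decode-and-print step in a single cell, assuming `pred` and `w2v_model` are still defined:

In [ ]:
result = [w2v_model.most_similar(positive=[vec], topn=1) for vec in pred[0]]
for r in result:
    print r[0][0]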

In [ ]: